In [38]:
import pandas as pd
import plotly.express as px
import math
In [39]:
# Load the training set from a pickle file (path is relative to the working directory).
# NOTE(review): unpickling can execute arbitrary code - only load this file if it
# comes from a trusted source.
data = pd.read_pickle('data/train.pkl')
In [4]:
# Number of rows for every (bot, type) combination, as a flat frame with a 'count' column
changes_by_type = (
    data
    .groupby(['bot', 'type'])
    .size()
    .reset_index(name='count')
)
In [5]:
# Bar chart of change counts per change type, coloured by the bot flag
px.bar(
    data_frame=changes_by_type,
    x='type',
    y='count',
    color='bot',
    color_discrete_sequence=px.colors.qualitative.Pastel,
    labels={'type': 'Change type', 'count': 'Count of changes', 'bot': 'User is bot'},
    title='Distribution of changes by type and user category',
    height=500,
    width=600,
)
In [6]:
# 1 when the user name contains 'bot' (case-insensitive), 0 otherwise.
# na=False counts a missing user name as "no match"; without it str.contains
# yields NaN for such rows and the following astype(int) raises.
data['bot_in_username'] = data['user'].str.contains('bot', case=False, na=False).astype(int)
In [7]:
# Turn the 0/1 flag into readable category labels for the chart.
# The result is assigned back to the column: calling mask(..., inplace=True) on
# data['bot_in_username'] is chained in-place assignment, which pandas deprecates
# and which leaves `data` unchanged once Copy-on-Write is enabled.
# Re-running the cell is harmless: once the column holds labels neither condition matches.
data['bot_in_username'] = (
    data['bot_in_username']
    .mask(data['bot_in_username'] == 1, "Have 'bot'")
    .mask(data['bot_in_username'] == 0, "Don't have 'bot'")
)
In [8]:
# Number of rows for every (bot, bot_in_username) combination, with a 'count' column
bot_in_username = (
    data
    .groupby(['bot', 'bot_in_username'])
    .size()
    .reset_index(name='count')
)
In [9]:
# Bar chart of change counts split by whether 'bot' appears in the user name,
# coloured by the bot flag
px.bar(
    data_frame=bot_in_username,
    x='bot_in_username',
    y='count',
    color='bot',
    color_discrete_sequence=px.colors.qualitative.Pastel,
    labels={
        'bot_in_username': "'bot' is in the user name",
        'count': 'Count of changes',
        'bot': 'User is bot',
    },
    title="Distribution by presence of 'bot' in the name and category",
    height=500,
    width=600,
)
In [10]:
# Length in characters of each change comment
data['comment_len'] = data.loc[:, 'comment'].str.len()
In [11]:
# Round every comment length up to the next multiple of 50 (upper edge of its bucket).
# Ceiling division is written as negated floor division, -((-x) // 50), so the whole
# column is processed at once instead of calling math.ceil per row; for integer
# lengths the result is the same, and a missing length stays NaN instead of
# raising "cannot convert float NaN to integer".
data['comment_len_in50'] = (-((-data['comment_len']) // 50)) * 50
In [12]:
# Only comments of at most 300 characters are plotted so the long tail
# does not squash the histogram
px.histogram(
    data_frame=data[data['comment_len'] <= 300],
    x='comment_len',
    color='bot',
    color_discrete_sequence=px.colors.qualitative.Pastel,
    labels={'comment_len': "Comment length", 'count': 'Count of changes', 'bot': 'User is bot'},
    title="Distribution by comment lengths and user category",
    height=500,
    width=600,
)
In [13]:
# Signed size change of each revision (new length minus old length);
# rows where the difference is missing are treated as a change of 0
data['revision_len'] = data['length.new'].sub(data['length.old']).fillna(0)
In [14]:
# Round every revision length up to the next multiple of 100.
# NOTE(review): the column name says "in10000" but the bucket width used here is 100 -
# confirm which of the two is intended.
data['revision_len_in10000'] = data['revision_len'].div(100).map(math.ceil).mul(100)
In [15]:
# cut off extremes: only revisions with -7000 < revision_len < 7000 are plotted
# The labels mapping is keyed by the plotted column ('revision_len'); the previous key
# 'revision_len_in10000' is not a column used by this figure, so the axis label
# was never applied.
px.histogram(data[(data['revision_len'] > -7000) & (data['revision_len'] < 7000)], x='revision_len', color='bot',
             title="Distribution by revision lengths and user category",
             width=600, height=500,
             labels={'bot': 'User is bot', 'revision_len': "Revision length", 'count': 'Count of changes'},
             color_discrete_sequence=px.colors.qualitative.Pastel)
In [41]:
# Convert the raw `timestamp` column to pandas datetimes, reading the values as seconds
data['datetime'] = pd.to_datetime(arg=data['timestamp'], unit='s')
In [17]:
# avg changes by user per minute
# Step 1: count the changes in every (user, minute) bucket.
changes_per_min = (
    data
    .groupby(['user', pd.Grouper(key='datetime', freq='min')])
    .size()
    .reset_index(name='changes_per_min')
    .rename(columns={'datetime': 'minute'})
)
# Step 2: average those per-minute counts for each user.
avg_changes_per_min = (
    changes_per_min
    .groupby('user')
    .agg(avg_changes_per_min=('changes_per_min', 'mean'))
    .reset_index()
)
# Step 3: attach the per-user average to every row of `data`.
# Any 'avg_changes_per_min' column left over from a previous run of this cell is
# dropped first; otherwise re-running the cell makes the merge emit
# 'avg_changes_per_min_x' / 'avg_changes_per_min_y' and breaks the cells below.
data = pd.merge(
    data.drop(columns='avg_changes_per_min', errors='ignore'),
    avg_changes_per_min,
    how='left',
    on='user',
)
In [18]:
# Round each user's average changes-per-minute up to the next multiple of 10
data['avg_changes_per_min_in10'] = data['avg_changes_per_min'].div(10).map(math.ceil).mul(10)
In [19]:
# Number of rows for every (bot, avg_changes_per_min_in10) combination.
# Note: this rebinds `avg_changes_per_min`, which previously held the per-user averages.
avg_changes_per_min = (
    data
    .groupby(['bot', 'avg_changes_per_min_in10'])
    .size()
    .reset_index(name='count')
)
In [20]:
# Histogram of the per-user average changes per minute (one entry per row of `data`),
# coloured by the bot flag
px.histogram(
    data_frame=data,
    x='avg_changes_per_min',
    color='bot',
    color_discrete_sequence=px.colors.qualitative.Pastel,
    labels={
        'avg_changes_per_min': "Average number of changes per minute",
        'count': 'Count of changes',
        'bot': 'User is bot',
    },
    title="Distribution by number of changes per minute and user category",
    height=500,
    width=600,
)
In [42]:
# avg changes by user per 2 seconds
# Step 1: count the changes in every (user, 2-second window) bucket.
# The frequency is spelled '2s': the upper-case 'S' alias is deprecated in recent pandas.
changes_per_2s = (
    data
    .groupby(['user', pd.Grouper(key='datetime', freq='2s')])
    .size()
    .reset_index(name='changes_per_2s')
    .rename(columns={'datetime': 'second'})
)
# Step 2: average those per-window counts for each user.
avg_changes_per_2s = (
    changes_per_2s
    .groupby('user')
    .agg(avg_changes_per_2s=('changes_per_2s', 'mean'))
    .reset_index()
)
# Step 3: attach the per-user average to every row of `data`.
# Any 'avg_changes_per_2s' column left over from a previous run of this cell is
# dropped first; otherwise re-running the cell makes the merge emit
# 'avg_changes_per_2s_x' / 'avg_changes_per_2s_y' and breaks the cells below.
data = pd.merge(
    data.drop(columns='avg_changes_per_2s', errors='ignore'),
    avg_changes_per_2s,
    how='left',
    on='user',
)
In [43]:
# Round each user's average changes-per-2-seconds up to a whole number (overwrites the column)
data['avg_changes_per_2s'] = data['avg_changes_per_2s'].map(math.ceil)
In [45]:
# Rows whose rounded average exceeds 10 changes per 2 seconds are left out of the plot
px.histogram(
    data_frame=data[data['avg_changes_per_2s'] <= 10],
    x='avg_changes_per_2s',
    color='bot',
    color_discrete_sequence=px.colors.qualitative.Pastel,
    labels={
        'avg_changes_per_2s': "Average number of changes per 2 seconds <",
        'count': 'Count of changes',
        'bot': 'User is bot',
    },
    title="Distribution by number of changes per 2 sec and user category",
    height=500,
    width=600,
)